
# Copyright 2016 Nervana Systems Inc. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

[-
# Build-time configuration for this template.
# $type and $D are only declared here, not assigned; they are expected to be
# set before this section runs. $type eq 'h' selects the first row of the
# table below, any other value selects the second. $D is copied to $determ.
our ($type, $D);
our $determ = $D;

# Columns: convert-in opcode, convert-out opcode, dtype shift, dtype size,
# vector width used by the global and zero-fill loads.
our ($convert_in, $convert_out, $dtype_shift, $dtype_size, $vec_size) =
    $type eq 'h' ? ('F2F.F32.F16', 'F2F.F16.F32', '1', '2',  '64')
                 : ('',            '',            '2', '4', '128');

# Accessors for use from the inline expansions in the assembly text.
sub dtype_shift
{
    return $dtype_shift;
}

sub vec_size
{
    return $vec_size;
}

# Opcode used to write results: a plain store when $determ is true,
# otherwise a floating point reduction-add.
sub output_op
{
    return 'STG.E.CG' if $determ;
    return 'RED.E.ADD.F32.FTZ.RN';
}
-]

# Symbolic names used by the instructions below.
#   addr_*   fixed shared-memory addresses used with STS and LDS: a 16-byte
#            slot that is cleared at kernel entry (addr_zero) followed by
#            four single-word slots.
#   param_*  locations in constant bank 0, consecutive 4-byte slots from
#            0x140 to 0x1ac. Names with [0] and [1] are the two words of one
#            64-bit value.
<CONSTANT_MAPPING>

    addr_zero   : 4x<32*36*2*4 + 64 + 0>
    addr_rYXN   : 4x<32*36*2*4 + 64 + 4>
    addr_iYXN   : 4x<32*36*2*4 + 64 + 5>
    addr_idx_K  : 4x<32*36*2*4 + 64 + 6>
    addr_idx_C  : 4x<32*36*2*4 + 64 + 7>

    param_F[0]         : c[0x0][0x140]
    param_F[1]         : c[0x0][0x144]
    param_I[0]         : c[0x0][0x148]
    param_I[1]         : c[0x0][0x14c]
    param_E[0]         : c[0x0][0x150]
    param_E[1]         : c[0x0][0x154]
    param_alpha        : c[0x0][0x158]
    param_K            : c[0x0][0x15c]
    param_C            : c[0x0][0x160]
    param_k            : c[0x0][0x164]
    param_c            : c[0x0][0x168]
    param_kc           : c[0x0][0x16c]
    param_magic_kc     : c[0x0][0x170]
    param_shift_kc     : c[0x0][0x174]
    param_magic_c      : c[0x0][0x178]
    param_shift_c      : c[0x0][0x17c]
    param_YXN2         : c[0x0][0x180]
    param_sYXN         : c[0x0][0x184]
    param_magic_sYXN   : c[0x0][0x188]
    param_shift_sYXN   : c[0x0][0x18c]
    param_stride_YXNp  : c[0x0][0x190]
    param_YXN          : c[0x0][0x194]
    param_YXN_1152     : c[0x0][0x198]
    param_RSK          : c[0x0][0x19c]
    param_CRSK         : c[0x0][0x1a0]
    param_Kp           : c[0x0][0x1a4]
    param_SKp          : c[0x0][0x1a8]
    param_RSK15_SK2p   : c[0x0][0x1ac]

</CONSTANT_MAPPING>

# Register allocation. Each line binds the register numbers or range on the
# left to the names on the right. Many lines cover the same registers: the
# cl, jl, T and track names are used by the load threads, the cc and jc names
# by the compute threads, and the scalar names in 32-91 by the setup and
# output code of either group.
# NOTE(review): the three separators used below select different binding
# modes in the assembler; confirm their exact meaning against the maxas
# documentation before moving names between lines.
<REGISTER_MAPPING>

       0-63 : czero<00-63>

     3, 2,11,10 : clx<0-3>y0
     7, 6,15,14 : clx<0-3>y1
     1, 0, 9, 8 : clx<0-3>y2
     5, 4,13,12 : clx<0-3>y3
    19,18,27,26 : clx<0-3>y4
    23,22,31,30 : clx<0-3>y5
    17,16,25,24 : clx<0-3>y6
    21,20,29,28 : clx<0-3>y7

      32-43 : jl0Fx<0-3>, jl0Iy<0-7>
      44-51 : jl1Fx<0-3>, jl1Iy<4-7>
      36-39 : jl1Iy<0-3>

      52-87 : T0<0-3>, T1<0-3>, T2<0-3>, T3<0-3>, T4<0-3>, T5<0-3>, T6<0-3>, T7<0-3>, T8<0-3>
      88-89 : track<0-1>
      90-91 ~ writeS

      32-86 ~ idx_YXNkc, idx_K, idx_C, idx_YXN, div<1-3>, magic_kc, neg_kc, idx_kc, idx_k, idx_c, YXN2_idx, neg_sYXN, magic_sYXN, remainder, yxn, offset, offset2, tid32_2, tid1, tid31
         87 = tid

      32-39 : shuffle16_x<0-3>y0, shuffle16_x<0-3>y1
      48-91 ~ Tid, Tid1, Tid32_2, write16Cs, alpha16


     3, 2,11,10,19,18,27,26 : ccx<0-7>y0
     7, 6,15,14,23,22,31,30 : ccx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : ccx<0-7>y2
     5, 4,13,12,21,20,29,28 : ccx<0-7>y3
    35,34,43,42,51,50,59,58 : ccx<0-7>y4
    39,38,47,46,55,54,63,62 : ccx<0-7>y5
    33,32,41,40,49,48,57,56 : ccx<0-7>y6
    37,36,45,44,53,52,61,60 : ccx<0-7>y7

      64-79 : jc0Fx<0-7>, jc0Iy<0-7>
      80-91 : jc1Fx<4-7>, jc1Iy<0-7>
      64-67 : jc1Fx<0-3>

      64-86 ~ tid16, tid_1, tid128

      92-95 ~ reduce_YXN, swapBuf, readFs, readIs


      64-89 ~ tid_128, tid_64, tid_32, tid_31, tid_16, Tid_1, idxC, idxK, idxI, readFs2, readIs2, offsetF, k, CRSK, xmad_determ
      86-89 : Out1<0-1>, Out2<0-1>
      90-91 : Out0<0-1>
      92-95 ~ alpha, writeCs, readCs, c

      64-79 : shuffle_x<0-7>y0, shuffle_x<0-7>y1

      84-85 ~ t<0-1>

       3, 2,11,19,10,18 : m<0-5>0
       1, 9, 0, 8,17,16 : m<0-5>1
      27,26,25,24,64,65 : m<0-5>2
      66,67,68,69,70,71 : m<0-5>3
      72,73,74,75,76,77 : m<0-5>4
      78,79,80,81,82,83 : m<0-5>5

       3, 2,11 : w00, w10, w20
       1, 9, 0 : w01, w11, w21
      27,26,25 : w02, w12, w22
      66,67,68 : w03, w13, w23
      72,73,74 : w04, w14, w24
      78,79,80 : w05, w15, w25

      19,10,18,69,70,71 ~ s00, s10, s20
       8,17,16,75,76,77 ~ s02, s12, s22
      24,64,65,81,82,83 ~ s01, s11, s21

</REGISTER_MAPPING>

# Kernel entry. Initialises swapBuf, clears the 16-byte addr_zero slot in
# shared memory, then splits the block by thread id: threads with
# tid >= 128 branch to COMPUTE_SETUP, the rest fall through to LOAD_SETUP.
--:-:-:-:0      MOV swapBuf, 4x<32*36*2*2>;
--:-:1:-:1      S2R tid, SR_TID.X;
--:-:-:-:1      STS.128 [addr_zero], RZ;
01:-:-:Y:d      ISETP.GE.AND P0, PT, tid, 128, PT;
--:-:-:-:5  @P0 BRA.U COMPUTE_SETUP;

##############################################################
# LOAD_SETUP: run by the threads with tid < 128. Derives the block indices
# from the CTA ids, the number of reduction steps (reduce_YXN), the
# shared-memory read and write addresses, and the element offset of the
# first global load. Thread 0 also publishes the indices to shared memory.
# Ends by sending threads with tid >= 64 to FILTER_SETUP; the others fall
# through to IMAGE_SETUP.
LOAD_SETUP:

--:-:1:-:1      S2R idx_YXNkc, SR_CTAID.X;
--:-:2:-:1      S2R idx_K,     SR_CTAID.Z;
--:-:3:-:1      S2R idx_C,     SR_CTAID.Y;

<SCHEDULE_BLOCK>

// Zero czero00..czero31 (the clx accumulators) by reading back the slot
// that was cleared at kernel entry.
[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..7; +]

// P0 = thread 0 (publishes the indices below), P1 = tid >= 64
--:-:-:-:1      ISETP.EQ.AND P0, PT, tid, RZ, PT;
--:-:-:-:1      ISETP.GE.AND P1, PT, tid, 64, PT;

// idx_YXN = idx_YXNkc / blk_kc
// Division by multiply with magic_kc and shift; when the magic value is 1
// only the shift is applied.
--:-:-:-:1      MOV  magic_kc, param_magic_kc;
--:-:-:-:1      IADD neg_kc, RZ, -param_kc;
--:-:-:-:1      ISETP.NE.AND P2, PT, magic_kc, 1, PT;
01:-:-:-:1  @P2 XMAD     div1, idx_YXNkc,    magic_kc,    RZ;
--:-:-:-:1  @P2 XMAD     div2, idx_YXNkc,    magic_kc.H1, RZ;
--:-:-:-:1  @P2 XMAD     div3, idx_YXNkc.H1, magic_kc.H1, RZ;
--:-:-:-:1  @P2 XMAD.CHI div1, idx_YXNkc.H1, magic_kc,    div1;
--:-:-:-:1  @P2 IADD3.RS idx_YXN, div1, div2, div3;
--:-:-:-:1  @P2 SHR.U32  idx_YXN, idx_YXN,   param_shift_kc;
--:-:-:-:1 @!P2 SHR.U32  idx_YXN, idx_YXNkc, param_shift_kc;

// idx_kc = idx_YXNkc % blk_kc
--:-:-:-:1      XMAD.LO2 idx_kc, neg_kc, idx_YXN, idx_YXNkc;

// idx_k = idx_kc / blk_c
// idx_c = idx_kc % blk_c
--:-:-:-:1      XMAD    idx_k,  idx_kc, param_magic_c, RZ;
--:-:-:-:1      SHR.U32 idx_k,  idx_k,  param_shift_c;
--:-:-:-:1      XMAD    idx_c,  idx_k,  param_c, RZ;
--:-:-:-:1      IADD    idx_c, -idx_c,  idx_kc;

// idx_K = idx_K * blk_k + idx_k
// idx_C = idx_C * blk_c + idx_c
02:-:-:-:1      XMAD idx_K, idx_K, param_k, idx_k;
04:-:-:-:1      XMAD idx_C, idx_C, param_c, idx_c;

// reduce_YXN  = ceil((YXN2 - idx_YXN) / sYXN)
--:-:-:-:1      IADD YXN2_idx, -idx_YXN, param_YXN2;
--:-:-:-:1      IADD neg_sYXN, RZ, -param_sYXN;
--:-:-:-:1      MOV  magic_sYXN, param_magic_sYXN;
--:-:-:-:1      ISETP.NE.AND P3, PT, magic_sYXN, 1, PT;
--:-:-:-:1  @P3 XMAD     div1, YXN2_idx,    magic_sYXN,    RZ;
--:-:-:-:1  @P3 XMAD     div2, YXN2_idx,    magic_sYXN.H1, RZ;
--:-:-:-:1  @P3 XMAD     div3, YXN2_idx.H1, magic_sYXN.H1, RZ;
--:-:-:-:1  @P3 XMAD.CHI div1, YXN2_idx.H1, magic_sYXN,    div1;
--:-:-:-:1  @P3 IADD3.RS reduce_YXN, div1, div2, div3;
--:-:-:-:1  @P3 SHR.U32  reduce_YXN, reduce_YXN, param_shift_sYXN;
--:-:-:-:1 @!P3 SHR.U32  reduce_YXN, YXN2_idx,   param_shift_sYXN;

// Round the quotient up: remainder = YXN2_idx - reduce_YXN*sYXN, clamped to
// at most 1, is added to the quotient.
--:-:-:-:1      XMAD.LO2  remainder, neg_sYXN, reduce_YXN, YXN2_idx;
--:-:-:-:1      IMNMX.U32 remainder, remainder, 1, PT;
--:-:-:-:1      IADD reduce_YXN, reduce_YXN, remainder;

// Thread 0 publishes the indices and the step count; they are read back
// after the first barrier by the threads that entered at COMPUTE_SETUP.
--:-:-:-:1  @P0 STS [addr_iYXN],  idx_YXN;
--:-:-:-:1  @P0 STS [addr_idx_K], idx_K;
--:-:-:-:1  @P0 STS [addr_idx_C], idx_C;
--:6:-:-:1  @P0 STS [addr_rYXN],  reduce_YXN;

// yxn = (tid & 63) >> 5
--:-:-:-:1      BFE.U32 yxn, tid, 0x105; // 1 bit at position 5

// offset2 = (idx_YXN + (reduce_YXN - 1)*sYXN)*2 + yxn
--:-:-:-:1      IADD     offset,  reduce_YXN, -1;
--:-:-:-:1      XMAD     offset2, offset,    param_sYXN, idx_YXN;
--:-:-:-:1      XMAD.PSL offset2, offset.H1, param_sYXN, offset2;
--:-:-:-:1      ISCADD   offset2, offset2,   yxn, 1;

// P6 = offset2 < YXN (the first load is replaced by zeros when false)
--:-:-:-:1      ISETP.LT.AND P6, PT, offset2, param_YXN, PT;

// P5 = reduce_YXN > 1
--:-:-:-:1      ISETP.GT.AND P5, PT, reduce_YXN, 1, PT;

--:-:-:-:1      LOP.AND  tid32_2,  tid,    -32;
--:-:-:-:1      SHR.U32  tid32_2,  tid32_2, 2;

// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7)
// then scaled by 16 bytes and moved past the first 32*36*2 words
--:-:-:-:1      BFE.U32 readFs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      LOP.OR  readFs, readFs, tid32_2;
--:-:-:-:1      ISCADD  readFs, readFs, 4x<32*36*2>, 4;

// readIs = ((tid & -32) >> 2) | ((tid & 16) >> 3) | (tid & 1)
// then scaled by 16 bytes
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readIs, tid,    16;
--:-:-:-:1      SHR.U32 readIs, readIs, 3;
--:-:-:-:1      IADD3   readIs, readIs, tid1, tid32_2;
--:-:-:-:1      SHL     readIs, readIs, 4;

// writeS = (yxn*32*36 + (tid & 31)*4)*4
--:-:-:-:1      LOP.AND tid31, tid, 31;
--:-:-:-:1      SHL writeS, tid31, 4;
--:-:-:-:1      XMAD writeS, yxn, 4x<32*36>, writeS;

// offset = offset2*32*36 + tid31*4
--:-:-:-:1      SHL tid31, tid31, 2;
--:-:-:-:0      XMAD.LO2 offset, offset2, 1x<32*36>, tid31;
</SCHEDULE_BLOCK>

--:-:-:-:6  @P1 BRA.U FILTER_SETUP;

##############################################################
# IMAGE_SETUP: reached by the load threads with tid < 64 (P1 false).
# Builds the 64-bit track pointer from param_I and the element offset scaled
# by the dtype shift, then jumps over FILTER_SETUP to LOAD.
IMAGE_SETUP:

<SCHEDULE_BLOCK>
// (GC32,GY,GX,N,6,6,32)
// offset += idx_C * YXN*32*36
--:-:-:-:1      XMAD.LO2C offset, idx_C, param_YXN_1152, offset;

// track = param_I + (offset << dtype shift), low word then carry into high
--:-:-:-:1      LEA      track0.CC, offset, param_I[0],     [+ dtype_shift() +];
--:-:-:-:0      LEA.HI.X track1,    offset, param_I[1], RZ, [+ dtype_shift() +];
</SCHEDULE_BLOCK>

--:-:-:-:6      BRA.U LOAD;

##############################################################
# FILTER_SETUP: reached by the load threads with tid >= 64 (P1 true).
# Moves writeS to the second shared-memory region, builds the 64-bit track
# pointer from param_E, then falls through to LOAD.
FILTER_SETUP:

<SCHEDULE_BLOCK>
// writeS += 32*36*2*4
--:-:-:-:1      IADD writeS, writeS, 4x<32*36*2>;

// (GK32,GY,GX,N,6,6,32)
// offset += idx_K * YXN*32*36
--:-:-:-:1      XMAD.LO2C offset, idx_K, param_YXN_1152, offset;

// track = param_E + (offset << dtype shift), low word then carry into high
--:-:-:-:1      LEA      track0.CC, offset, param_E[0],     [+ dtype_shift() +];
--:-:-:-:2      LEA.HI.X track1,    offset, param_E[1], RZ, [+ dtype_shift() +];
</SCHEDULE_BLOCK>

##############################################################
# LOAD: first reduction step for the load threads (both setup paths arrive
# here). Loads nine vectors T0..T8 through track, or zeros from addr_zero
# when P6 is false, writes them to shared memory at writeS, synchronises,
# switches writeS to the other buffer, reads the first operands for the
# FFMA loop and, when P5 is set, issues the global loads for the next step
# before entering LOAD_LOOP.
LOAD:

# T0..T8 in groups of three; each group signals its own barrier
# (2, 3 and 4) so the stores below can start as soon as a group has landed.
20:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>];
--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>];
--:-:2:-:1  @P6 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>];

--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T0, [addr_zero];
--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T1, [addr_zero];
--:-:2:-:1 @!P6 LDS.U.[+ vec_size() +] T2, [addr_zero];

--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>];
--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>];
--:-:3:-:1  @P6 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>];

--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T3, [addr_zero];
--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T4, [addr_zero];
--:-:3:-:1 @!P6 LDS.U.[+ vec_size() +] T5, [addr_zero];

--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>];
--:-:-:-:1  @P6 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>];
--:-:4:-:1  @P6 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>];

--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T6, [addr_zero];
--:-:-:-:1 @!P6 LDS.U.[+ vec_size() +] T7, [addr_zero];
--:-:4:-:1 @!P6 LDS.U.[+ vec_size() +] T8, [addr_zero];

[+
    # Store T0..T8 to shared memory. When a convert-in opcode is configured
    # each vector is first widened in place (two packed words become four
    # 32-bit words, highest destination first so no source is overwritten
    # before it is read); otherwise the vectors are stored as loaded.
    our $convert_in;
    return $convert_in ? q{

02:-:-:-:1      F2F.F32.F16 T03, T01.H1;
--:-:-:-:1      F2F.F32.F16 T02, T01.H0;
--:-:-:-:1      F2F.F32.F16 T01, T00.H1;
--:-:2:-:1      F2F.F32.F16 T00, T00.H0;

--:-:-:-:1      F2F.F32.F16 T13, T11.H1;
--:-:-:-:1      F2F.F32.F16 T12, T11.H0;
--:-:-:-:1      F2F.F32.F16 T11, T10.H1;
--:-:5:-:1      F2F.F32.F16 T10, T10.H0;

--:-:-:-:1      F2F.F32.F16 T23, T21.H1;
--:-:-:-:1      F2F.F32.F16 T22, T21.H0;
--:-:-:-:1      F2F.F32.F16 T21, T20.H1;
--:-:6:-:1      F2F.F32.F16 T20, T20.H0;

02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;

04:-:-:-:1      F2F.F32.F16 T33, T31.H1;
--:-:-:-:1      F2F.F32.F16 T32, T31.H0;
--:-:-:-:1      F2F.F32.F16 T31, T30.H1;
--:-:3:-:1      F2F.F32.F16 T30, T30.H0;

10:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;

--:-:-:-:1      F2F.F32.F16 T43, T41.H1;
--:-:-:-:1      F2F.F32.F16 T42, T41.H0;
--:-:-:-:1      F2F.F32.F16 T41, T40.H1;
--:-:5:-:1      F2F.F32.F16 T40, T40.H0;

20:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;

--:-:-:-:1      F2F.F32.F16 T53, T51.H1;
--:-:-:-:1      F2F.F32.F16 T52, T51.H0;
--:-:-:-:1      F2F.F32.F16 T51, T50.H1;
--:-:6:-:1      F2F.F32.F16 T50, T50.H0;

04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;

08:-:-:-:1      F2F.F32.F16 T63, T61.H1;
--:-:-:-:1      F2F.F32.F16 T62, T61.H0;
--:-:-:-:1      F2F.F32.F16 T61, T60.H1;
--:-:4:-:1      F2F.F32.F16 T60, T60.H0;

10:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;

--:-:-:-:1      F2F.F32.F16 T73, T71.H1;
--:-:-:-:1      F2F.F32.F16 T72, T71.H0;
--:-:-:-:1      F2F.F32.F16 T71, T70.H1;
--:-:5:-:1      F2F.F32.F16 T70, T70.H0;

20:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;

--:-:-:-:1      F2F.F32.F16 T83, T81.H1;
--:-:-:-:1      F2F.F32.F16 T82, T81.H0;
--:-:-:-:1      F2F.F32.F16 T81, T80.H1;
--:-:6:-:1      F2F.F32.F16 T80, T80.H0;

08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
10:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
20:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;

    } : q{
02:-:-:-:1      STS.128 [writeS + 4x<0*32*4>], T0;
--:-:-:-:1      STS.128 [writeS + 4x<1*32*4>], T1;
--:-:-:-:1      STS.128 [writeS + 4x<2*32*4>], T2;
04:-:-:-:1      STS.128 [writeS + 4x<3*32*4>], T3;
--:-:-:-:1      STS.128 [writeS + 4x<4*32*4>], T4;
--:-:-:-:1      STS.128 [writeS + 4x<5*32*4>], T5;
08:-:-:-:1      STS.128 [writeS + 4x<6*32*4>], T6;
--:-:-:-:1      STS.128 [writeS + 4x<7*32*4>], T7;
--:-:-:-:1      STS.128 [writeS + 4x<8*32*4>], T8;
    };
+]

# Move track back by param_stride_YXNp for the next step: low word here,
# the carry is applied to the high word after the barrier.
--:-:-:-:0      IADD   track0.CC, track0, -param_stride_YXNp;

# Make the stored tile visible, then point writeS at the other buffer and
# flip the sign of swapBuf so the next swap goes back.
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      IADD writeS, writeS, swapBuf;
--:-:-:-:1      IADD swapBuf, RZ, -swapBuf;

--:-:-:-:0      IADD.X track1,    track1, -RZ;

# First operands for the FFMA loop (set 0).
--:-:-:-:1      LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];
--:-:-:-:1      LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];
--:-:1:-:1      LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];

# Global loads for the second step, only when more than one step remains.
--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T0, [track + 4x<0*32 * $dtype_size>];
--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T1, [track + 4x<1*32 * $dtype_size>];
--:-:2:-:1  @P5 LDG.E.[+ vec_size() +] T2, [track + 4x<2*32 * $dtype_size>];
--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T3, [track + 4x<3*32 * $dtype_size>];
--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T4, [track + 4x<4*32 * $dtype_size>];
--:-:3:-:1  @P5 LDG.E.[+ vec_size() +] T5, [track + 4x<5*32 * $dtype_size>];
--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T6, [track + 4x<6*32 * $dtype_size>];
--:-:-:-:1  @P5 LDG.E.[+ vec_size() +] T7, [track + 4x<7*32 * $dtype_size>];
--:6:4:-:1  @P5 LDG.E.[+ vec_size() +] T8, [track + 4x<8*32 * $dtype_size>];

--:-:-:-:5      BRA.U LOAD_LOOP;

##############################################################

# COMPUTE_SETUP: entry point for the threads with tid >= 128. Zeroes the 64
# ccx accumulators, derives the shared-memory read addresses from
# tid - 128, waits for the load threads' first tile, then reads the step
# count and the first operand set before entering COMPUTE_LOOP.
COMPUTE_SETUP:

<SCHEDULE_BLOCK>
// Zero czero00..czero63 by reading back the slot cleared at kernel entry.
[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]

--:-:-:-:1      IADD tid128, tid, -128;

// In the formulas below tid stands for tid128 (tid - 128).
// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3)
// readIs = ((tid & -16) >> 1) | ((tid &  8) >> 2) | (tid & 1)
// Both are then scaled by 16 bytes and offset by 32*4 words; readFs is
// additionally moved past the first 32*36*2 words.
--:-:-:-:1      LOP.AND  tid16,  tid128, -16;
--:-:-:-:1      SHR.U32  tid16,  tid16,   1;

--:-:-:-:1      BFE.U32  readFs, tid128, 0x201; // 2 bits at position 1
--:-:-:-:1      LOP.OR   readFs, readFs, tid16;
--:-:-:-:1      ISCADD   readFs, readFs, 4x<32*4 + 32*36*2>, 4;

--:-:-:-:1      LOP.AND  tid_1,  tid128, 1;
--:-:-:-:1      LOP.AND  readIs, tid128, 8;
--:-:-:-:1      SHR.U32  readIs, readIs, 2;
--:-:-:-:1      IADD3    readIs, readIs, tid16, tid_1;
--:-:-:-:0      ISCADD   readIs, readIs, 4x<32*4>, 4;
</SCHEDULE_BLOCK>

# Pairs with the first BAR.SYNC of the load threads in LOAD.
--:-:-:-:5      BAR.SYNC 0;

# Step count published by thread 0 in LOAD_SETUP.
--:-:-:-:1      LDS reduce_YXN, [addr_rYXN];

--:-:-:-:1      LDS.U.128 jc0Iy0, [readIs + 4x<0*32*36 + 00>];
--:-:-:-:1      LDS.U.128 jc0Fx0, [readFs + 4x<0*32*36 + 00>];
--:-:-:-:1      LDS.U.128 jc0Iy4, [readIs + 4x<0*32*36 + 16>];
--:-:1:-:2      LDS.U.128 jc0Fx4, [readFs + 4x<0*32*36 + 16>];

COMPUTE_LOOP:
[+
    # Generates the unrolled COMPUTE_LOOP body. For each of the two operand
    # sets (0 and 1) it emits one FFMA per ccx accumulator, 64 in total, and
    # appends to an FFMA whatever text %splice holds for its slot. Slots are
    # keyed "j" . set . "c" . position.
    my %splice = (

        j0c33 => "--:-:-:-:1      ISETP.GT.AND P0, PT, reduce_YXN, 1, PT;\n" .
                 "--:-:-:-:1      IADD reduce_YXN, reduce_YXN, -1;\n",

        j0c62 => "02:-:-:Y:5      BAR.SYNC 0;\n" .
                 "--:-:-:-:1      IADD readIs, readIs, swapBuf;\n" .
                 "--:-:-:-:1      IADD readFs, readFs, swapBuf;\n" .
                 "--:-:-:-:1      IADD swapBuf, RZ,   -swapBuf;\n",

        j1c63 => "--:-:-:Y:5  \@P0 BRA.U COMPUTE_LOOP;\n" .
                 "--:-:-:Y:5      BRA.U COMPUTE_FINISH;\n",
    );

    # An FFMA whose first spliced line contains one of these opcodes is
    # given a stall count of 0 instead of 1.
    my $zero_stall_re = qr/^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/;

    # Order in which the (x, y) accumulators are visited: 2x4 patches walked
    # with the four-step pattern in @swirl, the y bases reversing direction
    # after every pair of x columns.
    my @visit;
    my @swirl  = ([0,2],[1,2],[1,0],[0,0]);
    my @y_base = (0,1,4,5);
    for my $x_base (0,2,4,6)
    {
        for my $y0 (@y_base)
        {
            push @visit, map { [$x_base + $_->[0], $y0 + $_->[1]] } @swirl;
        }
        @y_base = reverse @y_base;
    }

    my $sass = '';
    for my $set (0 .. 1)
    {
        my $other = 1 - $set;
        my $guard = $set == 1 ? '@P0' : '   ';
        my $rbar  = $set == 0 ? '2'   : '-';

        # Reads of the other operand set, issued while this set is consumed.
        # In set 1 they are guarded by P0 so nothing is read after the last
        # step. Columns: slot, control code, destination, pointer, offset.
        my @prefetch = (
            [  0, '--:-:-:-:1',       'Iy4', 'readIs', '16' ],
            [  2, '--:-:-:-:1',       'Fx4', 'readFs', '16' ],
            [  4, '--:-:-:-:1',       'Iy0', 'readIs', '00' ],
            [ 31, "--:$rbar:1:-:1",   'Fx0', 'readFs', '00' ],
        );
        for my $row (@prefetch)
        {
            my ($slot, $code, $dest, $pointer, $offset) = @$row;
            $splice{"j${set}c$slot"} = sprintf "%s  %s LDS.U.128 jc%d%s, [%s + 4x<%d*32*36 + %s>];\n",
                $code, $guard, $other, $dest, $pointer, $other, $offset;
        }

        for my $pos (0 .. 63)
        {
            my ($x, $y) = @{ $visit[$pos] };
            my $extra   = $splice{"j${set}c$pos"} || '';

            my $stall   = $extra =~ $zero_stall_re ? 0 : 1;
            my $yield   = ($stall && $pos % 10 == 0) ? 'Y' : '-';
            my $wait    = $pos ? '--' : '01';

            $sass .= sprintf "%s:-:-:%s:%s      FFMA ccx%dy%d, jc%dFx%d, jc%dIy%d, ccx%dy%d;\n%s",
                $wait, $yield, $stall,  $x, $y,  $set, $x,  $set, $y,  $x, $y,  $extra;
        }
    }
    return $sass;
+]

# LOAD_LOOP: steady-state loop of the load threads. Each pass stores the
# tile loaded during the previous pass to shared memory (when P0), issues
# the global loads for the following pass (when P1), and meanwhile runs
# 2 x 32 FFMAs on the clx accumulators using operands read from the buffer
# filled one pass earlier.
#   P0 = reduce_YXN > 1  another pass follows this one
#   P1 = reduce_YXN > 2  a tile beyond the next one still has to be fetched
LOAD_LOOP:
--:-:-:-:1      ISETP.GT.AND P0, PT, reduce_YXN, 1, PT;
20:-:-:-:1      IADD track0.CC, track0, -param_stride_YXNp;
--:-:-:-:1      ISETP.GT.AND P1, PT, reduce_YXN, 2, PT;
--:-:-:-:1      IADD reduce_YXN, reduce_YXN, -1;
[+
    # Generates the unrolled loop body. Text stored in %insert under the key
    # "j" . set . "c" . position is emitted right after the FFMA in that
    # slot. Two alternative store and load schedules are provided, selected
    # by whether a convert-in opcode is configured.
    our ($vec_size, $dtype_size, $convert_in);
    my %insert = (

        # carry for the track0 update issued just before this section
        j0c3 => "--:-:-:-:1      IADD.X track1, track1, -RZ;\n",

        j0c0  => "--:-:-:-:1      LDS.U.128 jl1Iy4, [readIs + 4x<1*32*36 + 16>];\n",
        j0c2  => "--:-:-:-:1      LDS.U.128 jl1Fx0, [readFs + 4x<1*32*36 + 00>];\n",
        j0c18 => "--:-:1:-:1      LDS.U.128 jl1Iy0, [readIs + 4x<1*32*36 + 00>];\n",

        j1c12 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Iy4, [readIs + 4x<0*32*36 + 16>];\n",
        j1c14 => "--:-:-:-:1  \@P0 LDS.U.128 jl0Fx0, [readFs + 4x<0*32*36 + 00>];\n",
        j1c16 => "--:-:1:-:1  \@P0 LDS.U.128 jl0Iy0, [readIs + 4x<0*32*36 + 00>];\n",

        $convert_in ? (

            j0c1  => "02:-:-:-:1      F2F.F32.F16 T03, T01.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T02, T01.H0;\n",
            j0c4  => "--:-:-:-:1      F2F.F32.F16 T01, T00.H1;\n" .
                     "--:-:2:-:1      F2F.F32.F16 T00, T00.H0;\n",

            j0c5  => "--:-:-:-:1      F2F.F32.F16 T13, T11.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T12, T11.H0;\n",
            j0c6  => "--:-:-:-:1      F2F.F32.F16 T11, T10.H1;\n" .
                     "--:-:5:-:1      F2F.F32.F16 T10, T10.H0;\n",

            j0c7  => "--:-:-:-:1      F2F.F32.F16 T23, T21.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T22, T21.H0;\n",
            j0c8  => "--:-:-:-:1      F2F.F32.F16 T21, T20.H1;\n" .
                     "--:-:6:-:1      F2F.F32.F16 T20, T20.H0;\n",

            j0c9  => "02:2:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
            j0c10 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
            j0c11 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",

            j0c13 => "02:-:-:-:1  \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n",
            j0c14 => "10:-:-:-:1  \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n",
            j0c15 => "20:-:2:-:1  \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n",

            j0c16 => "04:-:-:-:1      F2F.F32.F16 T33, T31.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T32, T31.H0;\n",
            j0c17 => "--:-:-:-:1      F2F.F32.F16 T31, T30.H1;\n" .
                     "--:-:3:-:1      F2F.F32.F16 T30, T30.H0;\n",

            j0c19 => "--:-:-:-:1      F2F.F32.F16 T43, T41.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T42, T41.H0;\n",
            j0c20 => "--:-:-:-:1      F2F.F32.F16 T41, T40.H1;\n" .
                     "--:-:5:-:1      F2F.F32.F16 T40, T40.H0;\n",

            j0c21 => "--:-:-:-:1      F2F.F32.F16 T53, T51.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T52, T51.H0;\n",
            j0c22 => "--:-:-:-:1      F2F.F32.F16 T51, T50.H1;\n" .
                     "--:-:6:-:1      F2F.F32.F16 T50, T50.H0;\n",

            j0c23 => "04:3:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
            j0c24 => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
            j0c25 => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",

            j0c27 => "04:-:-:-:1  \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n",
            j0c28 => "10:-:-:-:1  \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n",
            j0c29 => "20:-:3:-:1  \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n",

            j0c30 => "08:-:-:-:1      F2F.F32.F16 T63, T61.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T62, T61.H0;\n",
            j0c31 => "--:-:-:-:1      F2F.F32.F16 T61, T60.H1;\n" .
                     "--:-:4:-:1      F2F.F32.F16 T60, T60.H0;\n",

            j1c0  => "--:-:-:-:1      F2F.F32.F16 T73, T71.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T72, T71.H0;\n",
            j1c1  => "--:-:-:-:1      F2F.F32.F16 T71, T70.H1;\n" .
                     "--:-:5:-:1      F2F.F32.F16 T70, T70.H0;\n",

            j1c2  => "--:-:-:-:1      F2F.F32.F16 T83, T81.H1;\n" .
                     "--:-:-:-:1      F2F.F32.F16 T82, T81.H0;\n",
            j1c3  => "--:-:-:-:1      F2F.F32.F16 T81, T80.H1;\n" .
                     "--:-:6:-:1      F2F.F32.F16 T80, T80.H0;\n",

            j1c4  => "08:4:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
            j1c5  => "10:5:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
            j1c6  => "20:6:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",

            j1c8  => "08:-:-:-:1  \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n",
            j1c9  => "10:-:-:-:1  \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n",
            j1c10 => "20:6:4:-:1  \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n",

        ) : (

            # The T0 store carries the same P0 guard as every other store in
            # both schedules, so nothing is written to shared memory on the
            # final pass, when no further tile has been loaded.
            j0c6  => "02:-:-:-:1  \@P0 STS.128 [writeS + 4x<0*32*4>], T0;\n",
            j0c8  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<1*32*4>], T1;\n",
            j0c10 => "--:2:-:-:1  \@P0 STS.128 [writeS + 4x<2*32*4>], T2;\n",

            j0c12 => "02:-:-:-:1  \@P1 LDG.E.$vec_size T0, [track + 4x<0*32 * $dtype_size>];\n",
            j0c14 => "--:-:-:-:1  \@P1 LDG.E.$vec_size T1, [track + 4x<1*32 * $dtype_size>];\n",
            j0c16 => "--:-:2:-:1  \@P1 LDG.E.$vec_size T2, [track + 4x<2*32 * $dtype_size>];\n",

            j0c20 => "04:-:-:-:1  \@P0 STS.128 [writeS + 4x<3*32*4>], T3;\n",
            j0c22 => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<4*32*4>], T4;\n",
            j0c24 => "--:3:-:-:1  \@P0 STS.128 [writeS + 4x<5*32*4>], T5;\n",

            j0c26 => "04:-:-:-:1  \@P1 LDG.E.$vec_size T3, [track + 4x<3*32 * $dtype_size>];\n",
            j0c28 => "--:-:-:-:1  \@P1 LDG.E.$vec_size T4, [track + 4x<4*32 * $dtype_size>];\n",
            j0c30 => "--:-:3:-:1  \@P1 LDG.E.$vec_size T5, [track + 4x<5*32 * $dtype_size>];\n",

            j1c0  => "08:-:-:-:1  \@P0 STS.128 [writeS + 4x<6*32*4>], T6;\n",
            j1c2  => "--:-:-:-:1  \@P0 STS.128 [writeS + 4x<7*32*4>], T7;\n",
            j1c4  => "--:4:-:-:1  \@P0 STS.128 [writeS + 4x<8*32*4>], T8;\n",

            j1c6  => "08:-:-:-:1  \@P1 LDG.E.$vec_size T6, [track + 4x<6*32 * $dtype_size>];\n",
            j1c8  => "--:-:-:-:1  \@P1 LDG.E.$vec_size T7, [track + 4x<7*32 * $dtype_size>];\n",
            j1c10 => "--:6:4:-:1  \@P1 LDG.E.$vec_size T8, [track + 4x<8*32 * $dtype_size>];\n",
        ),

        # The barrier is taken on every pass, including the last one; the
        # buffer swap and the loop branch only happen when another pass
        # follows.
        j1c11 => "--:-:-:Y:5      BAR.SYNC 0;\n" .
                 "--:-:-:-:1  \@P0 IADD readIs, readIs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD readFs, readFs, -swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD writeS, writeS,  swapBuf;\n" .
                 "--:-:-:-:1  \@P0 IADD swapBuf, RZ,    -swapBuf;\n",

        j1c31 => "--:-:-:Y:5  \@P0 BRA.U LOAD_LOOP;\n",
    );

    # Visit order of the 4 x 8 clx accumulators: eight 2x4 patches, each
    # walked with the four-step pattern in @swirl.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    foreach my $xy ([0,0],[0,1],[2,1],[2,0],[2,4],[2,5],[0,5],[0,4])
    {
        my ($x, $y) = @$xy;
        push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
    }
    my $out;
    foreach my $j (0 .. 1)
    {
        foreach my $c (0 .. 31)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            # The first FFMA of each operand set waits on barrier 1, which
            # the operand reads signal.
            my $wait   = $c == 0 ? "01" : '--';

            # Stall 0 when the first inserted line holds one of these
            # opcodes, otherwise 1.
            my $stall  = $ins =~ /^[^\n]*(?:LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA)/ ? 0 : 1;

            my $ctrl   = "$wait:-:-:-:$stall";

            $out .= sprintf "%s      FFMA clx%dy%d, jl%dFx%d, jl%dIy%d, clx%dy%d;\n%s", $ctrl,  $x,$y,  $j,$x,  $j,$y,  $x,$y,  $ins;
        }
    }
    return $out;
+]

# Load threads, after LOAD_LOOP falls through: scale the clx accumulators
# by alpha and write them to shared memory in four passes of two rows each
# (y0 and y2, y1 and y3, y4 and y6, y5 and y7), with barriers between the
# passes, then exit.
# NOTE(review): the number and placement of the BAR.SYNC instructions here
# presumably has to match the barriers executed by the other thread group
# during its output phase; confirm before reordering anything.
--:-:1:-:2      S2R Tid, SR_TID.X;
<SCHEDULE_BLOCK>
--:-:-:-:1      MOV alpha16, param_alpha;

01:-:-:-:1      LOP.AND  Tid32_2,  Tid,    -32;
--:-:-:-:1      SHR.U32  Tid32_2,  Tid32_2, 2;

// readIs = ((tid & 16) >> 3) | (tid & 1)
--:-:-:-:1      LOP.AND Tid1,   Tid,    1;
01:-:-:-:1      LOP.AND readIs, Tid,    16;
--:-:-:-:1      SHR.U32 readIs, readIs, 3;
--:-:-:-:1      IADD    readIs, readIs, Tid1;

// readFs = ((tid & -32) >> 2) | ((tid >> 1) & 7) | (readIs << 2)
--:-:-:-:1      BFE.U32 readFs, Tid,    0x301; // 3 bits at position 1
--:-:-:-:1      LOP.OR  readFs, readFs, Tid32_2;
--:-:-:-:1      ISCADD  readFs, readIs, readFs, 2;

// readFs becomes a byte offset (16 bytes per unit), readIs is scaled by 8
--:-:-:-:1      SHL     readFs, readFs, 4;
--:-:-:-:1      SHL     readIs, readIs, 3;

// writeCs = readIs * 32*36 + readFs;
--:-:-:-:1      XMAD write16Cs, readIs, 1x<32*36>, readFs;
</SCHEDULE_BLOCK>

# Pass 1: rows y0 and y2
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y0, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y2, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y2, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y2, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y2, alpha16;
--:-:-:-:4      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

# Pass 2: rows y1 and y3
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y1, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y3, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y3, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y3, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y3, alpha16;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

# Pass 3: rows y4 and y6
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y4, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y6, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y6, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y6, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y6, alpha16;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

# Pass 4: rows y5 and y7
--:-:-:-:1      FMUL shuffle16_x0y0, clx0y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y0, clx1y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y0, clx2y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x3y0, clx3y5, alpha16;
--:-:-:-:1      FMUL shuffle16_x0y1, clx0y7, alpha16;
--:-:-:-:1      FMUL shuffle16_x1y1, clx1y7, alpha16;
--:-:-:-:1      FMUL shuffle16_x2y1, clx2y7, alpha16;
--:-:-:-:0      FMUL shuffle16_x3y1, clx3y7, alpha16;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      STS.128 [write16Cs+4x<0*32*36 + 00>], shuffle16_x0y0;
--:-:-:-:d      STS.128 [write16Cs+4x<1*32*36 + 00>], shuffle16_x0y1;
--:-:-:-:5      BAR.SYNC 0;

--:-:-:-:5      EXIT;

# COMPUTE_FINISH: reached by the compute threads when COMPUTE_LOOP ends.
# Computes the shared-memory write address for the scaled accumulators and,
# for threads with tid - 128 < 256, the shared-memory read address, the
# output coordinates k and c, the 64-bit output pointer Out0 and the bounds
# predicate P0. Threads with tid - 128 >= 256 (P4) skip that second part.
COMPUTE_FINISH:

--:-:1:-:2      S2R tid_128, SR_TID.X;
<SCHEDULE_BLOCK>

--:-:-:-:1      MOV alpha, param_alpha;

01:-:-:-:1      IADD tid_128, tid_128, -128;

--:-:-:-:1      ISETP.GE.AND P4, PT, tid_128, 256, PT;

// In the formulas below tid stands for tid_128 (tid - 128).
// readIs = ((tid &  8) >> 2) | (tid & 1)
--:-:-:-:1      LOP.AND  Tid_1,   tid_128, 1;
--:-:-:-:1      LOP.AND  readIs2, tid_128, 8;
--:-:-:-:1      SHR.U32  readIs2, readIs2, 2;
--:-:-:-:1      IADD     readIs2, readIs2, Tid_1;

// readFs = ((tid & -16) >> 1) | ((tid >> 1) & 3) | (readIs2 << 2)
--:-:-:-:1      LOP.AND  tid_16,   tid_128, -16;
--:-:-:-:1      SHR.U32  tid_16,   tid_16,   1;
--:-:-:-:1      BFE.U32  readFs2,  tid_128,  0x201; // 2 bits at position 1
--:-:-:-:1      LOP.OR   readFs2,  readFs2,  tid_16;
--:-:-:-:1      ISCADD   readFs2,  readIs2, readFs2, 2;

// readFs2 becomes a byte offset (16 bytes per unit) past the first 32*4
// words; readIs2 is scaled by 8
--:-:-:-:1      ISCADD   readFs2, readFs2,  4x<32*4>, 4;
--:-:-:-:1      SHL      readIs2, readIs2, 3;

// writeCs = readIs2 * 32*36 + readFs2;
--:-:-:-:0      XMAD writeCs, readIs2, 1x<32*36>, readFs2;
</SCHEDULE_BLOCK>

--:-:-:-:5  @P4 BRA.U SKIP0;

# Block indices published by thread 0 in LOAD_SETUP; idxI is only needed
# for the deterministic output layout.
--:-:1:-:1      LDS idxK, [addr_idx_K];
--:-:2:-:1      LDS idxC, [addr_idx_C];
[+ our $determ; return $determ ? q{--:-:3:-:1      LDS idxI, [addr_iYXN];} : ''; +]

<SCHEDULE_BLOCK>

--:-:-:-:1      LOP.AND tid_31, tid_128, 31;
--:-:-:-:1      SHR.U32 tid_32, tid_128,  5;
--:-:-:-:1      SHR.U32 tid_64, tid_128,  6;

// readCs = tid_32 * 32*36 + tid_31 + tid_64 * 16
// (in words; the final shift converts it to bytes)
--:-:-:-:1      XMAD   readCs, tid_32, 1x<32*36>, tid_31;
--:-:-:-:1      ISCADD readCs, tid_64, readCs, 4;
--:-:-:-:1      SHL    readCs, readCs, 2;

// k = K_blk*32 + tid_31
// c = C_blk*32 + tid_32<<1
--:-:-:-:1      SHL tid_32, tid_32, 1;
01:-:-:-:1      ISCADD  k, idxK, tid_31, 5;
02:-:-:-:1      ISCADD  c, idxC, tid_32, 5;


// offsetF = c*RSK + r*SK + s*K + k
// (only the c*RSK + k part is formed here)
--:-:-:-:1      XMAD.LO2C offsetF, c, param_RSK, k;

[+
    # Deterministic mode only: offsetF += idxI * CRSK.
    our $determ;
    return $determ ? q{
--:-:-:-:1      MOV CRSK, param_CRSK;
04:-:-:-:1      XMAD.LO offsetF, idxI, CRSK, offsetF, xmad_determ;
    } : '';
+]

// Out0 = param_F + offsetF*4 as a 64-bit pointer
--:-:-:-:1      LEA      Out00.CC, offsetF, param_F[0],     2;
--:-:-:-:1      LEA.HI.X Out01,    offsetF, param_F[1], RZ, 2;


// P0 = k < K
--:-:-:-:1      ISETP.LT.AND P0, PT, k, param_K, PT;
</SCHEDULE_BLOCK>

SKIP0:

// First shuffle pass. Derives the two further output pointers, scales the
// ccx*y0 and ccx*y2 accumulators by alpha and stores them to shared memory at
// writeCs. Threads that arrived here through the P4 branch execute this too;
// their Out registers were never set up, but every later use of those
// registers sits in code that P4 threads branch around.

<SCHEDULE_BLOCK>
// Out1 = Out0 + param_Kp, Out2 = Out1 + param_Kp (64-bit adds with carry).
--:-:-:-:1      IADD     Out10.CC, Out00, param_Kp;
--:-:-:-:1      IADD.X   Out11,    Out01, RZ;
--:-:-:-:1      IADD     Out20.CC, Out10, param_Kp;
--:-:-:-:1      IADD.X   Out21,    Out11, RZ;

// shuffle_x*y0 = alpha * ccx*y0, shuffle_x*y1 = alpha * ccx*y2
--:-:-:-:1      FMUL shuffle_x0y0, ccx0y0, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y0, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y0, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y0, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y0, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y0, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y0, alpha;
--:-:-:-:1      FMUL shuffle_x7y0, ccx7y0, alpha;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y2, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y2, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y2, alpha;
--:-:-:-:1      FMUL shuffle_x3y1, ccx3y2, alpha;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y2, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y2, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y2, alpha;
--:-:-:-:1      FMUL shuffle_x7y1, ccx7y2, alpha;
// Four 128-bit stores, each starting at the named shuffle register: the
// x0..x3 and x4..x7 halves of row y0 go to plane 0, those of row y1 to
// plane 1 (planes are 32*36 words apart).
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
</SCHEDULE_BLOCK>
// Block-wide barrier before any thread reads the tile back in
// OUTPUT_TRANSFORM.
--:-:-:-:5      BAR.SYNC 0;

// Output pass 1 followed by shuffle pass 2 (accumulator rows y1 and y3).
// Non-P4 threads run OUTPUT_TRANSFORM on the tile currently in shared memory,
// then advance all three output pointers by param_SKp and bump c by 1 so the
// c bounds check in OUTPUT_TRANSFORM follows the pointers. The 01/02/04 wait
// masks pair with barriers 1-3 set by the last three stores of
// OUTPUT_TRANSFORM, so a pointer is not modified until its store has read it.
--:-:-:-:5  @P4 BRA.U SKIP1;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
01:-:-:-:5      IADD   Out00.CC, Out00, param_SKp;
--:-:-:-:1      IADD c, c, 1;
--:-:-:-:1      IADD.X Out01,    Out01, RZ;
02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
--:-:-:-:1      IADD.X Out11,    Out11, RZ;
04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
--:-:-:-:1      IADD.X Out21,    Out21, RZ;

SKIP1:

// Scale rows y1 / y3 by alpha. The BAR.SYNC sits ahead of every STS below;
// presumably it keeps the previous tile intact until all threads have
// finished reading it. The statement order and stall counts here are
// hand-scheduled; do not reorder.
--:-:-:-:0      FMUL shuffle_x0y0, ccx0y1, alpha;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y1, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y1, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y1, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y1, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y1, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y1, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, ccx7y1, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y3, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y3, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y3, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, ccx3y3, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y3, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y3, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y3, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, ccx7y3, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
// Make the new tile visible to all threads before the next output pass.
--:-:-:-:5      BAR.SYNC 0;

// Output pass 2 followed by shuffle pass 3 (accumulator rows y4 and y6).
// Same shape as the previous pass, except that the pointers advance by
// param_RSK15_SK2p and c advances by 15 instead of 1.
// NOTE(review): the parameter name suggests 15*RSK - 2*SK, which would be
// consistent with the c += 15 step -- confirm against the host-side setup.
--:-:-:-:5  @P4 BRA.U SKIP2;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
01:-:-:-:5      IADD   Out00.CC, Out00, param_RSK15_SK2p;
--:-:-:-:1      IADD c, c, 15;
--:-:-:-:1      IADD.X Out01,    Out01, RZ;
02:-:-:-:6      IADD   Out10.CC, Out10, param_RSK15_SK2p;
--:-:-:-:1      IADD.X Out11,    Out11, RZ;
04:-:-:-:6      IADD   Out20.CC, Out20, param_RSK15_SK2p;
--:-:-:-:1      IADD.X Out21,    Out21, RZ;

SKIP2:

// Scale rows y4 / y6 by alpha; the BAR.SYNC precedes every STS that
// overwrites the shared tile. Hand-scheduled; do not reorder.
--:-:-:-:0      FMUL shuffle_x0y0, ccx0y4, alpha;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y4, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y4, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y4, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y4, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y4, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y4, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, ccx7y4, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y6, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y6, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y6, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, ccx3y6, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y6, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y6, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y6, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, ccx7y6, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
// Make the new tile visible to all threads before the next output pass.
--:-:-:-:5      BAR.SYNC 0;

// Output pass 3 followed by shuffle pass 4 (accumulator rows y5 and y7).
// Non-P4 threads run OUTPUT_TRANSFORM, then advance the three output pointers
// by param_SKp and c by 1. The 01/02/04 wait masks pair with barriers 1-3 set
// by the last three stores of OUTPUT_TRANSFORM.
--:-:-:-:5  @P4 BRA.U SKIP3;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
01:-:-:-:5      IADD   Out00.CC, Out00, param_SKp;
--:-:-:-:1      IADD c, c, 1;
--:-:-:-:1      IADD.X Out01,    Out01, RZ;
02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
--:-:-:-:1      IADD.X Out11,    Out11, RZ;
04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
--:-:-:-:1      IADD.X Out21,    Out21, RZ;

SKIP3:

// Scale rows y5 / y7 by alpha; the BAR.SYNC precedes every STS that
// overwrites the shared tile. Hand-scheduled; do not reorder.
--:-:-:-:0      FMUL shuffle_x0y0, ccx0y5, alpha;
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      FMUL shuffle_x1y0, ccx1y5, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, ccx2y5, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, ccx3y5, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, ccx4y5, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, ccx5y5, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, ccx6y5, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, ccx7y5, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, ccx0y7, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, ccx1y7, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, ccx2y7, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, ccx3y7, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*32*36 + 16>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, ccx4y7, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, ccx5y7, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, ccx6y7, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, ccx7y7, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<1*32*36 + 00>], shuffle_x0y1;
--:-:-:-:d      STS.128 [writeCs+4x<1*32*36 + 16>], shuffle_x4y1;
// Make the final tile visible to all threads before the last output pass.
--:-:-:-:5      BAR.SYNC 0;

// Final output pass: non-P4 threads write out the last tile. No pointer
// advance follows because nothing is written after this call; every thread
// then exits.
--:-:-:-:5  @P4 BRA.U SKIP4;
--:-:-:-:5      CAL OUTPUT_TRANSFORM;
SKIP4:

--:-:-:-:5      EXIT;

OUTPUT_TRANSFORM:

// Subroutine (entered with CAL, left with RET). Reads a 6x6 tile m00..m55
// from shared memory at readCs, reduces it to a 3x3 result s00..s22 in two
// passes that use the same coefficients, and writes the nine values through
// the Out0 / Out1 / Out2 pointers with the instruction chosen by output_op().
// Reads: readCs, c, P0, Out0..Out2. Writes: P1, t0, t1, m*, w*, s*, and
// advances Out0..Out2 by 2 * param_SKp in total.

// P1 = (c < param_C) AND P0; all nine stores below are predicated on P1.
--:-:-:-:0      ISETP.LT.AND P1, PT, c, param_C, P0;

// Generated code: 36 shared loads, m<j><i> from word offset (j*6+i)*32
// relative to readCs, emitted with i as the outer loop. The last load of each
// i group (j == 5) signals barrier i+1, so barriers 1..6 mark the six groups.
[+
    my $out;
    foreach my $i (0 .. 5)
    {
        foreach my $j (0 .. 5)
        {
            my $b = $j == 5 ? $i + 1 : '-';
            $out .= "--:-:$b:-:1      LDS m$j$i, [readCs + 4x<($j*6+$i)*32>];\n";
        }
    }
    return $out;
+]
<SCHEDULE_BLOCK>
// Generated code, first pass, once per i in 0..5. The first two adds wait on
// the barrier of load group i (mask 1 << i). Per i:
//   t0  = m1i + m2i,  t1 = m3i + m4i
//   w0i = m0i + t0 + t1
//   w1i = 0.625 * (m1i - m2i) + 1.5 * (m3i - m4i)
//   w2i = 0.390625 * t0 + 2.25 * t1 + m5i
// m1i and m3i are overwritten with the differences along the way.
[+
    my $out;
    foreach my $i (0 .. 5)
    {
        my $w = sprintf "%02x", 1 << $i;
        $out .= qq{
$w:-:-:-:1      FADD t0,   m1$i,  m2$i;
$w:-:-:-:1      FADD t1,   m3$i,  m4$i;
--:-:-:-:1      FADD m1$i, m1$i, -m2$i;
--:-:-:-:1      FADD m3$i, m3$i, -m4$i;
--:-:-:-:1      FADD w0$i, m0$i,  t0;
--:-:-:-:1      FADD w0$i, w0$i,  t1;
--:-:-:-:1      FMUL w1$i, m1$i,  0.625;
--:-:-:-:1      FFMA w1$i, m3$i,  1.5,      w1$i;
--:-:-:-:1      FFMA w2$i, t1,    2.25,     m5$i;
--:-:-:-:1      FFMA w2$i, t0,    0.390625, w2$i;
        };
    }
    return $out;
+]
</SCHEDULE_BLOCK>

<SCHEDULE_BLOCK>
// Generated code, second pass, once per i in 0..2. Same arithmetic applied
// along the other index of w:
//   t0  = wi1 + wi2,  t1 = wi3 + wi4
//   si0 = wi0 + t0 + t1
//   si1 = 0.625 * (wi1 - wi2) + 1.5 * (wi3 - wi4)
//   si2 = 0.390625 * t0 + 2.25 * t1 + wi5
[+
    my $out;
    foreach my $i (0 .. 2)
    {
        $out .= qq{
--:-:-:-:1      FADD t0,     w${i}1,  w${i}2;
--:-:-:-:1      FADD t1,     w${i}3,  w${i}4;
--:-:-:-:1      FADD w${i}1, w${i}1, -w${i}2;
--:-:-:-:1      FADD w${i}3, w${i}3, -w${i}4;
--:-:-:-:1      FADD s${i}0, w${i}0,  t0;
--:-:-:-:1      FADD s${i}0, s${i}0,  t1;
--:-:-:-:1      FMUL s${i}1, w${i}1,  0.625;
--:-:-:-:1      FFMA s${i}1, w${i}3,  1.5,      s${i}1;
--:-:-:-:1      FFMA s${i}2, t1,      2.25,     w${i}5;
--:-:-:-:1      FFMA s${i}2, t0,      0.390625, s${i}2;
        };
    }
    return $out;
+]

//--:-:1:-:1      I2F.F32.S32 temp, c;

// Row 0: write s00 / s01 / s02 through Out0 / Out1 / Out2. Each store signals
// its own barrier (1..3); the matching pointer add waits on that barrier
// (masks 01 / 02 / 04) before advancing the pointer by param_SKp.
<ORDERED>
--:1:-:-:1  @P1 [+ output_op() +] [Out0], s00;
--:2:-:-:1  @P1 [+ output_op() +] [Out1], s01;
--:3:-:-:1  @P1 [+ output_op() +] [Out2], s02;
01:-:-:-:6      IADD   Out00.CC, Out00, param_SKp;
--:-:-:-:1      IADD.X Out01,    Out01, RZ;
02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
--:-:-:-:1      IADD.X Out11,    Out11, RZ;
04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
--:-:-:-:1      IADD.X Out21,    Out21, RZ;
</ORDERED>

// Row 1: same pattern for s10 / s11 / s12, followed by another param_SKp step.
<ORDERED>
--:1:-:-:1  @P1 [+ output_op() +] [Out0], s10;
--:2:-:-:1  @P1 [+ output_op() +] [Out1], s11;
--:3:-:-:1  @P1 [+ output_op() +] [Out2], s12;
01:-:-:-:6      IADD   Out00.CC, Out00, param_SKp;
--:-:-:-:1      IADD.X Out01,    Out01, RZ;
02:-:-:-:6      IADD   Out10.CC, Out10, param_SKp;
--:-:-:-:1      IADD.X Out11,    Out11, RZ;
04:-:-:-:6      IADD   Out20.CC, Out20, param_SKp;
--:-:-:-:1      IADD.X Out21,    Out21, RZ;
</ORDERED>

// Row 2: s20 / s21 / s22. The pointers are left untouched here; barriers 1-3
// are still pending on return, and the caller waits on them before its own
// pointer update.
<ORDERED>
--:1:-:-:1  @P1 [+ output_op() +] [Out0], s20;
--:2:-:-:1  @P1 [+ output_op() +] [Out1], s21;
--:3:-:-:1  @P1 [+ output_op() +] [Out2], s22;
</ORDERED>
</SCHEDULE_BLOCK>

--:-:-:-:5      RET;
